{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7b56ae70",
   "metadata": {},
   "source": [
    "### STAR fusion stringent QC "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3ab169b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from pathlib import Path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5da52810",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "# inputs\n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c3cdc04e",
   "metadata": {},
   "outputs": [],
   "source": [
    "star_fusion_stringent_fusion_path = wkdir_path.joinpath(\"6_misexp_dissect/star_fusion/results_stringent_qc\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c79a7882",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of RNA IDs: 14\n"
     ]
    }
   ],
   "source": [
    "star_fusion_stringent_rna_path = wkdir_path.joinpath(\"6_misexp_dissect/star_fusion/star_fusion_rna_ids_max_sensitivity.txt\")\n",
    "star_fusion_stringent_rna = pd.read_csv(star_fusion_stringent_rna_path, sep=\"\\t\", header=None)[0].tolist()\n",
    "print(f\"Number of RNA IDs: {len(star_fusion_stringent_rna)}\")\n",
    "misexp_vrnt_feat_fusion_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.rna_id.isin(star_fusion_stringent_rna)]                                                     "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ce98c15d",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_tx_fusion_stringent_df_list = []\n",
    "for index,row in misexp_vrnt_feat_fusion_df.iterrows():\n",
    "    vrnt_id = row[\"vrnt_id\"]\n",
    "    misexp_gene_id = row[\"gene_id\"]\n",
    "    rna_id = row[\"rna_id\"]\n",
    "    # load STAR fusion predictions \n",
    "    star_fusion_predict_path = star_fusion_stringent_fusion_path.joinpath(f\"{rna_id}/FusionInspector-validate/finspector.FusionInspector.fusions.abridged.tsv\")\n",
    "    star_fusion_predict_df = pd.read_csv(star_fusion_predict_path, sep=\"\\t\")\n",
    "    # check if misexpressed gene in LeftGene\n",
    "    if not star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].empty:\n",
    "        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].copy()\n",
    "        misexp_fusion_df[\"rna_id\"] = rna_id \n",
    "        misexp_fusion_df[\"vrnt_id\"] = vrnt_id \n",
    "        misexp_fusion_df[\"gene_id\"] = misexp_gene_id \n",
    "        misexp_tx_fusion_stringent_df_list.append(misexp_fusion_df)\n",
    "     # check if misexpressed gene in RightGene\n",
    "    elif not star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].empty:\n",
    "        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].copy()\n",
    "        misexp_fusion_df[\"rna_id\"] = rna_id \n",
    "        misexp_fusion_df[\"vrnt_id\"] = vrnt_id \n",
    "        misexp_fusion_df[\"gene_id\"] = misexp_gene_id \n",
    "        misexp_tx_fusion_stringent_df_list.append(misexp_fusion_df)\n",
    "    else:\n",
    "        continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4779002a",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_tx_fusion_stringent_df = pd.concat(misexp_tx_fusion_stringent_df_list).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dfe7e274",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated SVs in cis to fusion event: 16\n",
      "Number of misexpression-associated SVs with consistent fusion events: 16\n"
     ]
    }
   ],
   "source": [
    "# check that fusion events occur in all samples carrying variant \n",
    "vrnts_consistent_fusion = []\n",
    "misexp_tx_fusion_vrnt_ids = misexp_tx_fusion_stringent_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated SVs in cis to fusion event: {len(misexp_tx_fusion_vrnt_ids)}\")\n",
    "for vrnt_id in misexp_tx_fusion_vrnt_ids: \n",
    "    fusion_rna_ids = set(misexp_tx_fusion_stringent_df[misexp_tx_fusion_stringent_df.vrnt_id == vrnt_id].rna_id.unique())\n",
    "    misexp_rna_ids = set(misexp_vrnt_feat_df[misexp_vrnt_feat_df.vrnt_id == vrnt_id].rna_id.unique())\n",
    "    if fusion_rna_ids == misexp_rna_ids: \n",
    "        vrnts_consistent_fusion.append(vrnt_id)\n",
    "print(f\"Number of misexpression-associated SVs with consistent fusion events: {len(vrnts_consistent_fusion)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a0043599",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of different fusion gene pairs (stringent QC): 10\n",
      "Number of different types of fusion event (stringent QC): 12\n",
      "Number of variant IDs where all carriers have a high evidence fusion: 16\n",
      "Number of fusion IDs where all carriers have a high evidence fusion: 12\n"
     ]
    }
   ],
   "source": [
    "fusion_stringent_names = misexp_tx_fusion_stringent_df['#FusionName'].unique()\n",
    "print(f\"Number of different fusion gene pairs (stringent QC): {len(fusion_stringent_names)}\")\n",
    "# some fusion events have the same name but involve different breakpoints \n",
    "misexp_tx_fusion_stringent_df[\"fusion_id\"] = misexp_tx_fusion_stringent_df[\"#FusionName\"] + \"|\" + misexp_tx_fusion_stringent_df[\"LeftBreakpoint\"] + \"|\" + misexp_tx_fusion_stringent_df[\"RightBreakpoint\"]\n",
    "fusion_ids_stringent = misexp_tx_fusion_stringent_df.fusion_id.unique()\n",
    "print(f\"Number of different types of fusion event (stringent QC): {len(fusion_ids_stringent)}\")\n",
    "\n",
    "# select fusion events that are consistent across all variant carriers \n",
    "misexp_tx_fusion_stringent_carriers_df = misexp_tx_fusion_stringent_df[[\"vrnt_id\", \"rna_id\", \"fusion_id\", \"LeftBreakpoint\", \"RightBreakpoint\"]].drop_duplicates()\n",
    "# count carriers \n",
    "misexp_tx_fusion_stringent_carriers_count_df = misexp_tx_fusion_stringent_carriers_df.groupby([\"vrnt_id\", \"fusion_id\", \"LeftBreakpoint\", \"RightBreakpoint\"], as_index=False).rna_id.count().rename(columns={\"rna_id\": \"carrier_count_fusion\"})\n",
    "# merge with number of carriers per misexpression-associated SV\n",
    "misexp_vrnt_carrier_df = misexp_vrnt_feat_df[[\"vrnt_id\", \"rna_id\"]].drop_duplicates()\n",
    "# count carriers \n",
    "misexp_vrnt_carrier_count_df = misexp_vrnt_carrier_df.groupby(\"vrnt_id\", as_index=False).rna_id.nunique().rename(columns={\"rna_id\": \"carrier_count_total\"})\n",
    "misexp_fusion_stringent_vrnt_carrier_df = pd.merge(misexp_vrnt_carrier_count_df, \n",
    "                                         misexp_tx_fusion_stringent_carriers_count_df, \n",
    "                                         on=\"vrnt_id\", how=\"inner\")\n",
    "# restrict to fusion events observed across all carriers \n",
    "misexp_fusion_stringent_vrnt_consistent_df = misexp_fusion_stringent_vrnt_carrier_df[misexp_fusion_stringent_vrnt_carrier_df.carrier_count_total == misexp_fusion_stringent_vrnt_carrier_df.carrier_count_fusion].copy()\n",
    "\n",
    "vrnt_id_consitent_fusion_stringent = misexp_fusion_stringent_vrnt_consistent_df.vrnt_id.unique()\n",
    "print(f\"Number of variant IDs where all carriers have a high evidence fusion: {len(vrnt_id_consitent_fusion_stringent)}\")\n",
    "fusion_id_consistent_stringent = misexp_fusion_stringent_vrnt_consistent_df.fusion_id.unique()\n",
    "print(f\"Number of fusion IDs where all carriers have a high evidence fusion: {len(fusion_id_consistent_stringent)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cfd599f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of different fusion gene pairs (stringent QC) in cis to variant: 10\n"
     ]
    }
   ],
   "source": [
    "# subset to variants only observed in cis to fusion event \n",
    "misexp_tx_fusion_stringent_consistent_df = misexp_tx_fusion_stringent_df[misexp_tx_fusion_stringent_df.vrnt_id.isin(vrnt_id_consitent_fusion_stringent)]\n",
    "fusion_names_consistent = misexp_tx_fusion_stringent_consistent_df[\"#FusionName\"].unique()\n",
    "print(f\"Number of different fusion gene pairs (stringent QC) in cis to variant: {len(fusion_names_consistent)}\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "352b4826",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write variant-gene-sample pairs supported by fusion event \n",
    "fusion_vrnt_gene_smpl_df = misexp_tx_fusion_stringent_consistent_df[[\"vrnt_id\", \"gene_id\", \"rna_id\"]].drop_duplicates()\n",
    "# write to file \n",
    "fusion_vrnt_gene_smpl_path = star_fusion_stringent_fusion_path.joinpath(\"tx_fusion_vrnts_gene_smpl.tsv\")\n",
    "fusion_vrnt_gene_smpl_df.to_csv(fusion_vrnt_gene_smpl_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0d84e768",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of transcript fusion candidate variants: 16\n"
     ]
    }
   ],
   "source": [
    "print(f\"Total number of transcript fusion candidate variants: {len(vrnt_id_consitent_fusion_stringent)}\")\n",
    "tx_fusion_vrnts_path = star_fusion_stringent_fusion_path.joinpath(\"tx_fusion_vrnts_list.txt\")\n",
    "with open(tx_fusion_vrnts_path, \"w\") as f_out: \n",
    "    for vrnt_id in vrnt_id_consitent_fusion_stringent: \n",
    "        f_out.write(f\"{vrnt_id}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b1c80066",
   "metadata": {},
   "source": [
    "Examined FusionInspector results in IGV: \n",
    "\n",
    "### Deletions\n",
    "\n",
    "**Prioritised by fusion criteria pipeline**\n",
    "* BOD1L1--LINC01097\n",
    "    * Variant ID: DEL_chr4_13529219_13608506\n",
    "    * Gene ID: ENSG00000281202 (LINC01097) misexpressed \n",
    "    * RNA ID: INT_RNA7960028 \n",
    "\n",
    "* FBXO8--LINC02268 fusion \n",
    "    * 284739 (likely causal variant)\n",
    "        * Three other variants in cis window: \n",
    "            * 284735 intronic variant in both genes \n",
    "            * DEL_chr4_174181857_174193905 intronic variant ENSG00000248174, no predicted effect ENSG00000250957\n",
    "            * DEL_chr4_174236814_174243134 upstream variant (not tx readthrough prioritised)\n",
    "    * 2 genes misexpressed: ENSG00000248174 & ENSG00000250957\n",
    "        * ENSG00000250957 is located inside the other gene ENSG00000248174 (LINC02268). \n",
    "        * ENSG00000250957 misexpressed via transcriptional readthrough. \n",
    "    * 2 supporting reads for fusion transcript \n",
    "    * Variant prioritised in both fusion transcript workflow and transcriptional readthrough pipeline \n",
    "        * ENSG00000248174 misexpressed via fusion \n",
    "        * ENSG00000250957 misexpress via readthrough \n",
    "\n",
    "**Unclear mechanism**\n",
    "* CPPED1--AC010333.2\n",
    "    * RNA ID: INT_RNA7960040\n",
    "    * Variant ID: 158491\n",
    "        * Intronic variant in the misexpressed gene involved in fusion\n",
    "    * Two genes misexpressed at locus: ENSG00000259876 (novel gene) & ENSG00000259899 (novel gene)\n",
    "    * ENSG00000259899 is involved in fusion \n",
    "    * Fusion transcript linking 3rd exon to last exon - 4 reads spanning splice site \n",
    "    * Potential cryptic splicing event generating novel splice site (not possible to confirm)\n",
    "    \n",
    "* MAP2K3--LINC01563\n",
    "    * Variant ID: 173184\n",
    "        * Affects one gene in one sample \n",
    "    * Gene ID: ENSG00000236819 (LINC01563)\n",
    "    * RNA ID: INT_RNA7960873\n",
    "    * Two different fusion events, hence duplicated in file\n",
    "    * This has not been picked up by DEL fusion prioritisation because:\n",
    "        * Upstream gene is USP22, it is on the wrong strand to form fusion product \n",
    "        * LINC01563 is on + strand as is MAP2K3, however it is upstream of MAP2K3, unclear how transcript fusion is generated \n",
    "    * Nice example for deletion mechanism - 34 reads spanning breakpoint\n",
    "    * Appears to be fusion of the 5'UTR of MAP2K3 to the 3' UTR of LINC01563 - novel non-coding transcript\n",
    "       \n",
    "* TAF1C--CDH13\n",
    "    * Variant ID: DEL_chr16_83921765_83922967, DEL_chr16_83997914_83999941, DEL_chr16_83966491_84032889\n",
    "        * All variants affect same gene in same sample \n",
    "    * RNA ID: INT_RNA7959485\n",
    "    * Gene ID: ENSG00000140945\n",
    "    * Only one read supporting fusion transcript and no long anchor support\n",
    "    * Multiple variants, unclear which is causal if any\n",
    "    \n",
    "**Readthrough and intergenic splicing**\n",
    "* ST6GAL1--RTP1 - transcriptional readthrough with intergenic splicing \n",
    "    * Variant IDs: DEL_chr3_187079365_187083300 and DEL_chr3_187069321_187094542\n",
    "        * DEL_chr3_187069321_187094542 is associated with misexpression of two genes: ENSG00000175077 & ENSG00000283175 - one is further away, mechanism is unclear \n",
    "        * Variant pair only found in one sample, one is prioritised by tx readthrough while other is not\n",
    "    * Gene ID: ENSG00000175077 (RTP1)\n",
    "    * RNA ID: INT_RNA7710961\n",
    "\n",
    "* TBCA--OTP - 293588 - transcriptional readthrough with intergenic splicing \n",
    "    * Variant ID: 293588\n",
    "        * Only affects one gene in one sample \n",
    "    * RNA_ID: INT_RNA7710599\n",
    "    * Gene ID: ENSG00000171540\n",
    "    * Transcriptional readthrough event with intergenic fusion \n",
    "\n",
    "### Duplications\n",
    "* STON2--LINC02308 - previously characterised DUP leading to Tx fusion\n",
    "    * 2 samples: INT_RNA7710877 & INT_RNA7710267\n",
    "    * Variant ID: 397951 (present in both samples) \n",
    "        * Only associated with one misexpressed gene \n",
    "    * ENSG00000258675 (LINC02308)\n",
    "* KARS--CPHXL - previously chracterised DUP leading to Tx fusion\n",
    "    * RNA ID: INT_RNA7710560\n",
    "    * Variant ID: 401916 \n",
    "        * Only affects one gene in one sample \n",
    "    * Gene ID: ENSG00000283755 (CPHXL)\n",
    "* AC005747.1--MYH1 - previously characterised DUP leading to Tx fusion\n",
    "    * RNA ID: INT_RNA7961048\n",
    "    * Variant ID: 402648\n",
    "        * Only affects one gene in one sample \n",
    "    * Gene ID: ENSG00000109061 (MYH1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d326454e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
